#!/usr/bin/perl
##---------------------------------------------------------------------------##
##  File:
##      @(#) RepeatProteinMask
##  Author:
##      Arian Smit <asmit@systemsbiology.org>
##      Robert Hubley <rhubley@systemsbiology.org>
##  Description:
##      Given a sequence file and a protein database, mask
##      all hits using blastx.
##
#******************************************************************************
#* Copyright (C) Institute for Systems Biology 2005-2019 Developed by
#* Arian Smit and Robert Hubley.
#*
#* This work is licensed under the Open Source License v2.1.  To view a copy
#* of this license, visit http://www.opensource.org/licenses/osl-2.1.php or
#* see the license.txt file contained in this distribution.
#*
###############################################################################
#

=head1 NAME

RepeatProteinMask - Mask Repeat Proteins in DNA sequence

=head1 SYNOPSIS

  RepeatProteinMask [-engine ncbi|abblast|wublast] [-pvalue #] 
                    [-minscore #] [-wordsize #] [-maxAADist] 
                    [-noLowSimple] [-queryStatLen #] <fasta file>

=head1 DESCRIPTION

The options are:

=over 4

=item -h(elp)

=item -engine ncbi|abblast|wublast

Use the provided search engine to run the blastx runs.

=item -pvalue #

The threshold for accepting matches. Matches must hava a pvalue
less than this number. Default is *NOT* to threshold.

=item -minscore #

Threshold on minscore.  Note no default is added. So all hits 
will be returned unless a minscore is used.

=item -wordsize #

The wordsize to use with the blastx search. Default is 3

=item -querystatlen #

The effective length of the query to use in statistical calculations.

=item -maxaadist #

The maximum distance to consider two blastx hits as the same
(and thus contributing to Sum P scores).  Default is 333.

=item -noLowSimple

Turns off masking/annotating of low complexity and simple repeats
in the final output.  Low complexity and simple repeat analysis 
will still occur prior to looking for matches to the RepeatPep
database.

=back

=head1 CONFIGURATION OVERRIDES

=head1 SEE ALSO

=over 4

RepeatModeler

=back

=head1 COPYRIGHT

Copyright 2005-2019 Institute for Systems Biology

=head1 AUTHOR

Robert Hubley <rhubley@systemsbiology.org>

=cut

#
# Module Dependence
#
use strict;
use FindBin;
use lib $FindBin::RealBin;
use Data::Dumper;
use Pod::Text;
use Carp;
use Getopt::Long;

# RepeatMasker Libraries
use RepeatMaskerConfig;
use SeqDBI;
use FastaDB;
use WUBlastXSearchEngine;
use NCBIBlastXSearchEngine;
use CrossmatchSearchEngine;
use TRF;
use TRFSearchResult;
use SearchEngineI;

#
# Class Globals & Constants
#
my $CLASS = "RepeatProteinMask";
my $DEBUG = 0;
$DEBUG = 1 if ( $RepeatMaskerConfig::DEBUGALL == 1 );
my $version = $RepeatMaskerConfig::VERSION;

#
# Option processing
#  e.g.
#   -t: Single letter binary option
#   -t=s: String parameters
#   -t=i: Number paramters
#
my @opts =
    qw( help consensi=s wordsize=s pvalue=s maxaadist=s nolowsimple minscore=s querystatlen=s engine=s );
# Add configuration parameters as additional command-line options
push @opts, RepeatMaskerConfig::getCommandLineOptions();

#
# Provide the POD text from this file and 
# from the config file by merging them 
# together.  The heading "CONFIGURATION
# OVERRIDES" provides the insertion point
# for the configuration POD.
#
sub usage {
  my $p = Pod::Text->new();
  $p->output_fh(*STDOUT);
  my $pod_str;
  open IN,"<$0" or die "Could not open self ($0) for generating documentation!";
  while (<IN>){
    if ( /^=head1\s+CONFIGURATION OVERRIDES\s*$/ )
    { 
      my $c_pod = RepeatMaskerConfig::getPOD();
      if ( $c_pod ) {
        $pod_str .= $_ . $c_pod;
      }
    }else {
      $pod_str .= $_;
    }
  }
  close IN;
  print "$0 - $version\n";
  $p->parse_string_document($pod_str);
  exit(1);
}

## Pvalue default used to be 0.0001 now it doesn't filter by default

#
# Get the supplied command line options, and set flags
#
my %options = ();
unless ( &GetOptions( \%options, @opts ) )
{
  usage();
}

# Print the internal POD documentation if something is missing
if ( $options{'help'} || !$#ARGV < 1 )
{
  usage();
}

#
# Resolve configuration settings using the following precedence: 
# command line first, then environment, followed by config
# file.
#
RepeatMaskerConfig::resolveConfiguration(\%options);
my $config = $RepeatMaskerConfig::configuration;
my $LIBDIR = $config->{'LIBDIR'}->{'value'};
my $NCBIBLASTX_PRGM = $config->{'RMBLAST_DIR'}->{'value'} . "/blastx";
my $NCBIBLASTDB_PRGM = $config->{'RMBLAST_DIR'}->{'value'} . "/makeblastdb";
my $ABBLASTX_PRGM = $config->{'ABBLAST_DIR'}->{'value'} . "/blastx";
my $XDFORMAT_PRGM = $config->{'ABBLAST_DIR'}->{'value'} . "/xdformat";
my $ABBLAST_DIR = $config->{'ABBLAST_DIR'}->{'value'};

# This will miss alot. But in GC rich DNA a lower score
# will cause false positives
my $minScore = 20;
if ( defined $options{'minscore'} )
{
  $minScore = $options{'minscore'};
}

my $maxAADist = 333;
if ( defined $options{'maxaadist'} )
{
  $maxAADist = $options{'maxaadist'};
}

#determines which matches above which P value will be ignored
#my $cutoffP = 0.0001;
#if ( defined $options{'pvalue'} ) {
#  $cutoffP = $options{'pvalue'};
#}

my $wordSize = 3;
if ( defined $options{'wordsize'} )
{
  $wordSize = $options{'wordsize'};
}

my $fastaFile = "";
my $fileDir   = "";
if ( -s $ARGV[ 0 ] )
{
  my @fileParts = File::Spec->splitpath( $ARGV[ 0 ] );
  $fileDir   = $fileParts[ 1 ];
  $fastaFile = $fileParts[ 2 ];
} else
{
  die $CLASS . ": Missing fasta file parameter!\n";
}

#
# Assume we want to place the results next to the original file
#
if ( $fileDir ne "." && $fileDir ne "" )
{
  chdir( $fileDir );
}

# open up the consensi database
#
if ( exists $options{'nolowsimple'} )
{
  print
      "Identifying Simple and Low Complexity Repeats...(masking turned off)\n";
} else
{
  print "Masking Simple Repeats...\n";
}
my $seqDB = FastaDB->new(
                          fileName    => $fastaFile,
                          openMode    => SeqDBI::ReadOnly,
                          maxIDLength => 50
);

if ( $seqDB->getSeqCount() <= 0 )
{
  die "\n\nSomething is wrong with the input sequence. Possibly a formatting "
      . "problem.  Please correct your sequence data and resubmit.\n";
}

my $scratchDir = "/tmp";

#
# RepeatMasker
#
system( "cp $fastaFile $fastaFile.rmsimple" );
`$FindBin::RealBin/RepeatMasker -qq -noint -no_is $fastaFile.rmsimple > /dev/null 2>&1`;
my $RMResults;
if ( -e "$fastaFile.rmsimple.out" && -e "$fastaFile.rmsimple.masked" )
{
  $RMResults =
      CrossmatchSearchEngine::parseOutput(
                                    searchOutput => "$fastaFile.rmsimple.out" );
  print "   - Tandem Repeats: " . $RMResults->size() . "\n";
  system( "mv $fastaFile.rmsimple.masked $fastaFile.rmsimple" );
} else
{
  print "   - Tandem Repeats: 0\n";
}

unlink( "$fastaFile.rmsimple.out" ) if ( -e "$fastaFile.rmsimple.out" );
unlink( "$fastaFile.rmsimple.cat" ) if ( -e "$fastaFile.rmsimple.cat" );
unlink( "$fastaFile.rmsimple.tbl" ) if ( -e "$fastaFile.rmsimple.tbl" );
unlink( "$fastaFile.rmsimple.log" ) if ( -e "$fastaFile.rmsimple.log" );

#
# Comparison against database of transposon proteins
#
# blastx the simple-masked consensi vs the transposable element
# protein database with the fasta line format
#      >TWIFBIG#DNA/HAT-Ac
#
# This name may be *immediately* followed by #ReverseORF to indicate
# that the product is encoded on the opposite strand of the
# transposable element. It needs to be right after it, otherwise it
# may fall on the next line in the blastx output's hit description.
#
print "Masking Repeat Proteins...\n";

# Set the environment
my $searchEngineX;

#
# Parameter Translations:
#   WU/ABBlastX       NCBIBlastX
#   -------------     -----------------
#   -hspsepqmax #     -max_intron_length #
#   -W #              -word_size #
#   -Y #              -searchsp #
#   -T #              -threshold #
#
my $queryStatLen = 1000000;
if ( defined $options{'querystatlen'} )
{
  $queryStatLen = $options{'querystatlen'};
}

if ( $options{'engine'} =~ /ncbi|rmblast/i )
{
  $searchEngineX =
      NCBIBlastXSearchEngine->new(
                         pathToEngine => $NCBIBLASTX_PRGM );

  if ( !-s "$LIBDIR/RepeatPeps.lib.psq" )
  {
    system(   "$NCBIBLASTDB_PRGM -dbtype prot -in "
            . "$LIBDIR/RepeatPeps.lib "
            . "> /dev/null 2>&1" );
  }

  my $additionalParams =
        " -word_size $wordSize -max_intron_length $maxAADist"
      . " -searchsp $queryStatLen -threshold $queryStatLen ";
  $searchEngineX->setAdditionalParameters( $additionalParams );
} else
{
  $ENV{BLASTMAT} = "$ABBLAST_DIR/matrix";
  $searchEngineX =
      WUBlastXSearchEngine->new(
                    pathToEngine => $ABBLASTX_PRGM );

  if ( !-s "$LIBDIR/RepeatPeps.lib.xps" )
  {
    system( "$XDFORMAT_PRGM -p $LIBDIR/RepeatPeps.lib" );
  }

  my $additionalParams =
      " -W=$wordSize hspsepqmax=$maxAADist" . " -Y=$queryStatLen ";
  $searchEngineX->setAdditionalParameters( $additionalParams );
}

$searchEngineX->setQuery( "$fastaFile.rmsimple" );
$searchEngineX->setSubject( "$LIBDIR/RepeatPeps.lib" );
$searchEngineX->setFilterWords( 1 );
if ( defined $options{'pvalue'} )
{
  $searchEngineX->setPValueCutoff( $options{'pvalue'} );
}
if ( $minScore ne "" )
{
  $searchEngineX->setMinScore( $minScore );
}
$searchEngineX->setScoreMode( SearchEngineI::complexityAdjustedScoreMode );
$searchEngineX->setMaskLevel( 85 );
my ( $status, $resultCollection ) = $searchEngineX->search();

print "   - Protein Hits = " . $resultCollection->size() . "\n";
if ( $resultCollection->size() > 0 )
{
  if ( exists $options{'nolowsimple'} )
  {
    unlink( "$fastaFile.rmsimple" );
    maskResults( resultsRef => $resultCollection,
                 fastaFile  => $fastaFile );
  } else
  {
    maskResults( resultsRef => $resultCollection,
                 fastaFile  => "$fastaFile.rmsimple" );
    unlink( "$fastaFile.rmsimple" );
    system( "mv $fastaFile.rmsimple.masked $fastaFile.masked" );
  }
} else
{
  if ( exists $options{'nolowsimple'} )
  {
    system( "cp $fastaFile $fastaFile.masked" );
  } else
  {
    system( "mv $fastaFile.rmsimple $fastaFile.masked" );
  }
}

# Sort through results and create annotation file.
open ANO, ">$fastaFile.annot";
my $hdrStr = sprintf(
"%-10.10s %6s %-14.14s %-18.18s %-8.8s %-8.8s %1s %-18.18s %-18.18s %-8.8s %-8.8s\n",
  "pValue", "Score",  "Method", "SeqID", "Begin", "End",
  " ",      "Repeat", "Type",   "Begin", "End"
);
print ANO $hdrStr;
unless ( exists $options{'nolowsimple'}
         || !defined $RMResults )
{
  $resultCollection->addAll( $RMResults );
  $resultCollection->sort(
    sub ($$) {
      $_[ 0 ]->getQueryName() cmp $_[ 1 ]->getQueryName()
          || $_[ 0 ]->getQueryStart() <=> $_[ 1 ]->getQueryStart();
    }
  );
}

if ( $resultCollection->size() > 0 )
{
  my $prevQueryName = $resultCollection->get( 0 )->getQueryName();
  for ( my $k = 0 ; $k < $resultCollection->size() ; $k++ )
  {
    my $result = $resultCollection->get( $k );
    my $orient = "+";
    $orient = "-" if ( $result->getOrientation() eq "C" );

    # Break up the subject name into repeat name and repeat type
    my $subjName = $result->getSubjName();
    my $subjType = $result->getSubjType();
    if ( $result->getSubjName() =~ /(\S+)\#(\S+)/ )
    {
      $subjName = $1;
      $subjType = $2;
    }

    my $outStr;
    if ( !$result->getPValue() =~ /[\d\.\-\e]+/ )
    {

      # Result came from RepeatMasker
      $outStr = sprintf(
         "%-10.2s %6d %-14s %-18.18s %-8d %-8d %1s %-15.15s %-15.15s %8d %8d\n",
         "-",                      $result->getScore(),
         "RMasker/TRF",            $result->getQueryName(),
         $result->getQueryStart(), $result->getQueryEnd(),
         $orient,                  $subjName,
         $subjType,                $result->getSubjStart(),
         $result->getSubjEnd()
      );

    } else
    {
      $outStr = sprintf(
         "%-10.2e %6d %-14s %-18.18s %-8d %-8d %1s %-15.15s %-15.15s %8d %8d\n",
         $result->getPValue(),     $result->getScore(),
         "WUBlastX",               $result->getQueryName(),
         $result->getQueryStart(), $result->getQueryEnd(),
         $orient,                  $subjName,
         $subjType,                $result->getSubjStart(),
         $result->getSubjEnd()
      );

    }
    print ANO "$outStr";

  }
}
close ANO;

# Cya!
print "Done!\n";
exit;

#-------------------- S U B R O U T I N E S ------------------------------#

##---------------------------------------------------------------------##
##
##  maskResults()
##
##  Use: maskDatabase( resultsRef => #ref#,
##                     fastaFile => "/jo/bob/seq.fa",
##                   );
##
##
##
##---------------------------------------------------------------------##
sub maskResults
{
  my %parameters = @_;

  # Parameter checking
  die $CLASS
      . "::maskResults(): Missing or invalid resultsRef "
      . "parameter!\n"
      if ( !defined $parameters{'resultsRef'} );
  my $resultsRef = $parameters{'resultsRef'};

  die $CLASS . "::maskResults(): Missing fastaFile parameter!\n"
      if (    !defined $parameters{'fastaFile'}
           || !-s $parameters{'fastaFile'} );
  my $fastaFile = $parameters{'fastaFile'};

  my %maskRanges = ();
  my $maskDB = FastaDB->new( fileName => $fastaFile,
                             openMode => SeqDBI::ReadOnly );
  open OUT, ">$fastaFile.masked";

  for ( my $k = 0 ; $k < $resultsRef->size() ; $k++ )
  {
    my $result = $resultsRef->get( $k );
    push @{ $maskRanges{ $result->getQueryName() } },
        [
          $result->getQueryStart() - 1,
          $result->getQueryEnd() - $result->getQueryStart() + 1
        ];
  }

  my %idsSeen = ();
  foreach my $idKey ( keys( %maskRanges ) )
  {
    $idsSeen{$idKey} = 1;
    print OUT ">" . $idKey . " " . $maskDB->getDescription( $idKey ) . "\n";
    my $seq = $maskDB->getSequence( $idKey );
    foreach my $range ( @{ $maskRanges{$idKey} } )
    {

      #print " Masking seq: " .  $range->[ 0 ] . " - " .  $range->[ 1 ] . "\n";
      substr( $seq, $range->[ 0 ], $range->[ 1 ] ) = "N" x $range->[ 1 ];
    }
    $seq =~ s/(.{50})/$1\n/g;
    print OUT "$seq\n";
  }

  # Write out any records which didn't have any masking
  foreach my $idKey ( $maskDB->getIDs() )
  {
    next if ( exists $idsSeen{$idKey} );
    print OUT ">" . $idKey . " " . $maskDB->getDescription( $idKey ) . "\n";
    my $seq = $maskDB->getSequence( $idKey );
    $seq =~ s/(.{50})/$1\n/g;
    print OUT "$seq\n";
  }
  close OUT;

}

########################################################################################
########################################################################################
########################################################################################
########################################################################################

#sub TRFMask {
#  my $seqDB      = shift;
#  my $trfObj     = shift;
#  my $maskFile   = shift;
#  my $scratchDir = shift;
#
#  open OUT, ">$maskFile"
#      || die $CLASS . "::TRFMask: Could not open file $maskFile!\n";
#
#  # Create a tempDirectory for us
#  my $tmpDir;
#  do {
#    $tmpDir = $scratchDir . "/trfResults-" . time();
#  } while ( -d $tmpDir );
#  mkdir( $tmpDir );
#  $scratchDir = $tmpDir;
#
#  # Foreach sequence
#  my $repeatsMasked = 0;
#  my %allResults    = ();
#  foreach my $seqID ( $seqDB->getIDs() ) {
#
#    print OUT ">" . $seqID . " " . $seqDB->getDescription( $seqID ) . "\n";
#    my $seqLen = $seqDB->getSeqLength( $seqID );
#
#    # TODO: Make this capable of batching small sequences!
#    # Break into 5mb pieces...NOTE: Must keep track of seqID
#    for ( my $i = 0 ; $i < $seqLen ; $i += 5000000 ) {
#      my $batchSeq;
#
#      # Create temp seq file
#      open TMPFILE, ">$scratchDir/tmpseq.fa"
#          || die $CLASS
#          . ": Could not open "
#          . "temporary file $scratchDir/tmpseq.fa for output!\n";
#      print TMPFILE ">seq1\n";
#      if ( $i + 5000000 > $seqLen ) {
#        $batchSeq = $seqDB->getSubstr( $seqID, $i );
#      }
#      else {
#        $batchSeq = $seqDB->getSubstr( $seqID, $i, 5000000 );
#      }
#      print TMPFILE "$batchSeq\n";
#      close TMPFILE;
#
#      # Run TRF
#      my ( $retCode, $trfResults ) = $trf->search( sequenceFile => "$scratchDir/tmpseq.fa",
#                                                  workDir      => $scratchDir );
#
#      print $CLASS. ": TRF Returned " . $trfResults->size() . " results\n"
#      ;
#      #if ( $DEBUG );
#
#      for ( my $j = 0; $j < $trfResults->size(); $j++ )
#      {
#        my $result = $trfResults->get($j);
#        bless $result, "TRFSearchResult";
#
#        # TODO: Document why?
#        if (    $result->getCopyNumber() > 4
#             && $result->getPeriod() > 1 )
#        {
#          # Mask
#          my $start = $result->getQueryStart() - 1;
#          my $len   = $result->getQueryEnd() - $start;
#
#          #print "Masking: ".$result->toString()."\n";
#          substr( $batchSeq, $start, $len ) = "N" x $len;
#          $repeatsMasked++;
#          push @{ $allResults{$seqID} }, $result;
#        }
#      }
#
#      # write chunk out
#      $batchSeq =~ s/(.{50})/$1\n/g;
#      print OUT "$batchSeq\n";
#
#    }
#  }
#  close OUT;
#  unlink( "$scratchDir/tmpseq.fa" ) if ( -e "$scratchDir/tmpseq.fa" );
#
#  system( "rm -rf $scratchDir" );
#
#  #print "   $repeatsMasked Tandem Repeats Masked\n";
#
#  return ( $repeatsMasked, \%allResults );
#}

1;
